# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
import otter
set_matplotlib_formats("svg")  # render plots as crisp SVGs rather than rasterized PNGs
plt.style.use('ggplot')  # use the ggplot style for all matplotlib plots
# Abbreviate long arrays when printed, and round floats to 2 decimal places.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Abbreviate displayed DataFrames: at most 7 rows, 8 columns, 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Setup to start where we left off last time
keep_cols = ['business_name', 'inspection_date', 'inspection_score', 'risk_category', 'Neighborhoods', 'Zip Codes']
restaurants_full = bpd.read_csv('data/restaurants.csv').get(keep_cols)
# Mask that is True for businesses whose (lowercased) name mentions 'bake'.
is_bakery = restaurants_full.get('business_name').str.lower().str.contains('bake')
bakeries = restaurants_full[is_bakery]
# Keeping only the rows where we know the inspection score
has_score = bakeries.get('inspection_score') >= 0
bakeries = bakeries[has_score]
Let's pick up where we left off last lecture.
bakeries
| business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes | |
|---|---|---|---|---|---|---|
| 327 | Le Marais Bakery Castro | 2018-08-06T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
| 365 | Pho Luen Fat Bakery & Restaurant | 2019-04-08T00:00:00.000 | 76.0 | Low Risk | NaN | NaN |
| 372 | Brioche Bakery & Cafe | 2019-01-31T00:00:00.000 | 88.0 | Low Risk | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 53954 | Fancy Wheatfield Bakery | 2019-03-04T00:00:00.000 | 83.0 | Moderate Risk | NaN | NaN |
| 54102 | New Hollywood Bakery & Restaurant | 2016-08-30T00:00:00.000 | 74.0 | High Risk | NaN | NaN |
| 54171 | Speciality's Cafe and Bakery | 2019-04-29T00:00:00.000 | 89.0 | Moderate Risk | NaN | NaN |
1216 rows × 6 columns
np.random.seed(23) # Ignore this — it fixes the random seed so everyone gets the same sample
# One random sample of 200 bakeries (sampling is without replacement by default).
sample_of_bakeries = bakeries.sample(200) # SOLUTION
sample_of_bakeries
| business_name | inspection_date | inspection_score | risk_category | Neighborhoods | Zip Codes | |
|---|---|---|---|---|---|---|
| 33359 | Universal Bakery Inc. | 2019-01-28T00:00:00.000 | 83.0 | Low Risk | 2.0 | 28859.0 |
| 19980 | Cherry Blossom Bakery 2 | 2016-06-28T00:00:00.000 | 90.0 | Moderate Risk | NaN | NaN |
| 29825 | Waterfront Bakery | 2018-06-07T00:00:00.000 | 94.0 | Low Risk | 32.0 | 308.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 4835 | Marla Bakery | 2018-09-10T00:00:00.000 | 91.0 | High Risk | NaN | NaN |
| 26932 | PRINCESS BAKERY | 2016-08-16T00:00:00.000 | 79.0 | Low Risk | 5.0 | 28861.0 |
| 34201 | Castro Tarts Cafe and Bakery Inc. | 2017-08-23T00:00:00.000 | 82.0 | Low Risk | NaN | NaN |
200 rows × 6 columns
Using a single sample of 200 bakeries, how can we estimate the median inspection score of all bakeries in San Francisco with an inspection score? What technique should we use?
A. Standard hypothesis testing
B. Permutation testing
C. Bootstrapping
D. The Central Limit Theorem
There is no CLT for sample medians, so instead we'll have to resort to bootstrapping to estimate the distribution of the sample median.
Recall, bootstrapping is the act of sampling from the original sample, with replacement. This is also called resampling.
# The median of our original sample – this is just one number,
# our single observed estimate of the population median.
sample_of_bakeries.get('inspection_score').median() # SOLUTION
87.0
# The median of a single bootstrap resample – this is just one number.
# A resample draws 200 rows *with* replacement from the original sample of 200.
sample_of_bakeries.sample(200, replace=True).get('inspection_score').median() # SOLUTION
86.5
Let's resample repeatedly.
np.random.seed(23) # Ignore this
boot_medians = np.array([])
# BEGIN SOLUTION
# Resample 200 rows with replacement 5,000 times, recording the median of
# each resample. Collecting the medians in one comprehension and converting
# to an array once avoids the repeated whole-array copies that calling
# np.append inside the loop would do (O(k^2) total copying). The resamples
# are drawn in the same order, so the results are unchanged.
boot_medians = np.array([
    sample_of_bakeries.sample(200, replace=True).get('inspection_score').median()
    for _ in np.arange(5000)
])
# END SOLUTION
boot_medians
array([87. , 85. , 86.5, ..., 87.5, 88. , 86. ])
bpd.DataFrame().assign(boot_medians=boot_medians).plot(kind='hist', density=True, ec='w', bins=10, figsize=(10, 5));
Note that this distribution is not at all normal.
To compute a 95% confidence interval, we take the middle 95% of the bootstrapped medians.
# BEGIN SOLUTION
# The middle 95% of the bootstrapped medians: chop off 2.5% from each tail.
left, right = np.percentile(boot_medians, [2.5, 97.5])
[left, right]
# END SOLUTION
[85.0, 88.0]
Which of the following interpretations of this confidence interval are valid?
You work as a family physician. You collect data and you find that in 6354 patients, 3115 were children and 3239 were adults.
You want to test the following hypotheses:
Which test statistic(s) could be used for this hypothesis test? Which values of the test statistic point towards the alternative?
A. Proportion of children seen
B. Number of children seen
C. Number of children minus number of adults seen
D. Absolute value of number of children minus number of adults seen
There may be multiple correct answers; choose one.
Let's use option B, the number of children seen, as a test statistic. Small values of this statistic favor the alternative hypothesis.
How do we generate a single value of the test statistic?
np.random.multinomial(6354, [0.5, 0.5])[0] # SOLUTION
3172
As usual, let's simulate the test statistic many, many times.
test_stats = np.array([])
# BEGIN SOLUTION
# Simulate the test statistic 10,000 times under the null hypothesis.
# Collecting the counts in one comprehension and converting to an array once
# avoids the repeated whole-array copies that calling np.append inside the
# loop would do (O(k^2) total copying). dtype=float matches the float array
# that appending to an empty float array would have produced.
test_stats = np.array([
    np.random.multinomial(6354, [0.5, 0.5])[0]
    for _ in np.arange(10000)
], dtype=float)
# END SOLUTION
test_stats
array([3204., 3213., 3172., ..., 3150., 3198., 3213.])
# Density histogram of the simulated test statistics (the null distribution).
bpd.DataFrame().assign(test_stats=test_stats) \
.plot(kind='hist', density=True, ec='w', figsize=(10, 5), bins=20);
# Mark the observed statistic (3115 children) so we can see where it falls
# relative to the null distribution.
plt.axvline(3115, lw=3, color='black', label='observed statistic')
plt.legend();
Recall that you collected data and found that in 6354 patients, 3115 were children and 3239 were adults.
What goes in blank (a)?
p_value = np.count_nonzero(test_stats __(a)__ 3115) / 10000
A. >=
B. >
C. <=
D. <
<=
# Calculate the p-value
What do we do, assuming that we're using a 5% p-value cutoff?
A. Reject the null
B. Fail to reject the null
C. It depends
Note that while we used np.random.multinomial to simulate the test statistic, we could have used np.random.choice, too:
# One simulation under the null: classify each of the 6354 patients as
# 'adult' or 'child', each with probability one half.
choices = np.random.choice(['adult', 'child'], p=[0.5, 0.5], size=6354, replace=True) # SOLUTION
choices
array(['adult', 'adult', 'adult', ..., 'child', 'child', 'adult'],
dtype='<U5')
np.count_nonzero(choices == 'child') # SOLUTION
3142
Is this an example of bootstrapping?
A. Yes, because we are sampling with replacement.
B. No, this is not bootstrapping.
babypandas code is regular pandas code, too!
These sites allow you to search for datasets (in CSV format) from a variety of different domains. Some may require you to sign up for an account; these are generally reputable sources.
Note that all of these links are also available at rampure.org/find-datasets.
Tip: if a site only allows you to download a file as an Excel file, not a CSV file, you can download it, open it in a spreadsheet viewer (Excel, Numbers, Google Sheets), and export it to a CSV.
The Data Science Student Society’s Projects Committee has opened its applications for the Winter 2023 cohort!
Apply here by Sunday at 11:59pm (ignore the deadline on the form). Contact ds3projects@gmail.com with questions.
plotly
So far, we've used matplotlib (through df.plot) to create visualizations. plotly is a different visualization library that allows us to create interactive visualizations.
import plotly.express as px
Gapminder Foundation is a non-profit venture registered in Stockholm, Sweden, that promotes sustainable global development and achievement of the United Nations Millennium Development Goals by increased use and understanding of statistics and other information about social, economic and environmental development at local, national and global levels. - Gapminder Wikipedia
# Load the Gapminder dataset that ships with plotly.
gapminder = px.data.gapminder()
gapminder
| country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Asia | 1952 | 28.80 | 8425333 | 779.45 | AFG | 4 |
| 1 | Afghanistan | Asia | 1957 | 30.33 | 9240934 | 820.85 | AFG | 4 |
| 2 | Afghanistan | Asia | 1962 | 32.00 | 10267083 | 853.10 | AFG | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1701 | Zimbabwe | Africa | 1997 | 46.81 | 11404948 | 792.45 | ZWE | 716 |
| 1702 | Zimbabwe | Africa | 2002 | 39.99 | 11926563 | 672.04 | ZWE | 716 |
| 1703 | Zimbabwe | Africa | 2007 | 43.49 | 12311143 | 469.71 | ZWE | 716 |
1704 rows × 8 columns
The dataset contains information for each country for several different years.
gapminder.get('year').unique()
array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
2007])
Let's start by just looking at 2007 data (the most recent year in the dataset).
# Keep only the rows for 2007, the most recent year in the dataset.
gapminder_2007 = gapminder[gapminder.get('year') == 2007]
gapminder_2007
| country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
|---|---|---|---|---|---|---|---|---|
| 11 | Afghanistan | Asia | 2007 | 43.83 | 31889923 | 974.58 | AFG | 4 |
| 23 | Albania | Europe | 2007 | 76.42 | 3600523 | 5937.03 | ALB | 8 |
| 35 | Algeria | Africa | 2007 | 72.30 | 33333216 | 6223.37 | DZA | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1679 | Yemen, Rep. | Asia | 2007 | 62.70 | 22211743 | 2280.77 | YEM | 887 |
| 1691 | Zambia | Africa | 2007 | 42.38 | 11746035 | 1271.21 | ZMB | 894 |
| 1703 | Zimbabwe | Africa | 2007 | 43.49 | 12311143 | 469.71 | ZWE | 716 |
142 rows × 8 columns
We can plot life expectancy vs. GDP per capita. If you hover over a point, you will see the name of the country.
px.scatter(gapminder_2007, x='gdpPercap', y='lifeExp', hover_name='country')
In future courses, you'll learn about transformations. Here, we'll apply a log transformation to the x-axis to make the plot look a little more linear.
px.scatter(gapminder_2007, x='gdpPercap', y='lifeExp', log_x=True, hover_name='country')
We can take things one step further.
# An animated scatter plot: one frame per year (animation_frame), points
# colored by continent and sized by population (bubble size capped by
# size_max). range_y fixes the y-axis so frames are comparable.
px.scatter(gapminder,
x = 'gdpPercap',
y = 'lifeExp',
hover_name = 'country',
color = 'continent',
size = 'pop',
size_max = 60,
log_x = True,
range_y = [30, 90],
animation_frame = 'year',
title = 'Life Expectancy, GDP Per Capita, and Population over Time'
)
Watch this video if you want to see an even-more-animated version of this plot.
# Animated histogram: one frame per year, showing how the distribution of
# life expectancy changes over time. Axis ranges are fixed so that the
# frames are directly comparable.
px.histogram(gapminder,
x = 'lifeExp',
animation_frame = 'year',
range_x = [20, 90],
range_y = [0, 50],
title = 'Distribution of Life Expectancy over Time')
# A choropleth (world map) shading each country by its life expectancy.
# Countries are matched to map regions via their ISO alpha-3 codes.
px.choropleth(gapminder,
locations = 'iso_alpha',
color = 'lifeExp',
hover_name = 'country',
hover_data = {'iso_alpha': False},
title = 'Life Expectancy Per Country',
color_continuous_scale = px.colors.sequential.tempo
)
Data science is about drawing useful conclusions from data using computation. Throughout the quarter, we touched on several aspects of data science:
Suraj's freshman year transcript. Don't let your grades define you — they don't tell the full story.
Adjusting to life in college can be challenging, particularly because it can be hard to manage your time wisely.

If you're interested, register to attend this workshop on overcoming procrastination taught by another data science professor next quarter – participants each receive a $50 Amazon gift card!
This course would not have been possible without...